Name: Makesh Srinivasan
Registration number: 19BCE1717
Course code: CSE4020
Faculty: Dr. Abdul Quadir
Slot: L31 + L32
Date: 15-November-2021 Monday
Instructions:
Use a dataset to perform K-fold cross validation and build a stacking ensemble with KNN and Naive Bayes as the first-layer classifiers and logistic regression as the second-layer (meta) classifier.
Load the Iris dataset using the URL 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'. The Iris dataset consists of 3 classes each with fifty instances. For the sake of simplicity, only the first 100 containing 2 classes of species are used in this exercise
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from numpy.random import seed
from numpy.random import randint
import plotly.express as px
import plotly.graph_objs as go
import plotly
def load(URL_):
    """Fetch the Iris CSV, keep the first two classes, and tidy the columns.

    Returns a DataFrame with three feature columns plus a binary 'Species'
    column (0 = Iris-setosa, 1 = Iris-versicolor).
    """
    df = pd.read_csv(URL_, header=None)
    # Rows 0-99 hold the two species used in this exercise.
    df = df[:100]
    # Encode the species string label numerically: setosa -> 0, otherwise 1.
    df[4] = np.where(df.iloc[:, -1] == 'Iris-setosa', 0, 1)
    print("Data loaded...")
    # Drop the fourth feature so the remaining three stay 3-D plottable.
    df = df.drop(3, axis=1)
    df = df.rename({0: 'SepalLengthCm', 1: 'SepalWidthCm',
                    2: 'PetalLengthCm', 4: 'Species'}, axis=1)
    return df
# Download the Iris data from the UCI repository and preview the first rows.
data = load('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')
data.head(3)
Data loaded...
| SepalLengthCm | SepalWidthCm | PetalLengthCm | Species | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0 |
# Separate the binary target from the three feature columns.
y = data["Species"]
X = data.drop(columns="Species")
Data visualisation:
# Colour each sample by its class label:
# green -> Iris-versicolor
# blue -> Iris-setosa
buffer = data.Species.tolist()
Species = ["green" if label == 1 else "blue" for label in buffer]
trace1 = go.Scatter3d(
    x=data.SepalLengthCm.tolist(),
    y=data.SepalWidthCm.tolist(),
    z=data.PetalLengthCm.tolist(),
    mode='markers',
    marker=dict(color=Species),
    name="Dataset",
)
fig = go.Figure(data=[trace1])
fig.update_layout(scene=dict(xaxis_title='SepalLengthCm',
                             yaxis_title='SepalWidthCm',
                             zaxis_title='PetalLengthCm'))
plotly.offline.iplot(fig, filename='simple-3d-scatter')
KNN functions:
def distance(c1, c2):
    """Return the Euclidean distance between coordinate vectors c1 and c2.

    Only the first len(c1) components are compared, so a longer c2
    (e.g. a training row carrying a trailing label) is truncated.
    """
    squared_sum = sum((c1[i] - c2[i]) ** 2 for i in range(len(c1)))
    return np.sqrt(squared_sum)
def accuracy(p, y):
    """Return the fraction of predictions in p that match the targets in y.

    Prints a warning and returns None when the two sequences differ in
    length.
    """
    if len(p) != len(y):
        print("Number of X records and y targets are not equal!")
        return
    hits = sum(1 for target, pred in zip(y, p) if target == pred)
    return hits / len(y)
def KNN_3D_visualise(test_data, k_nearest_neighbours, dataset):
    """Render an interactive 3-D view of one KNN classification.

    Draws four traces: the full dataset (translucent, coloured by class),
    the k nearest neighbours (solid), the query point (red), and a line
    from the query point to each neighbour.

    Parameters
    ----------
    test_data : sequence of three feature values for the query point.
    k_nearest_neighbours : list of ([f1, f2, f3, label], distance) pairs,
        as produced by nearest_neighbours().
    dataset : DataFrame with SepalLengthCm, SepalWidthCm, PetalLengthCm
        and a numeric Species column (1 -> green, otherwise blue).
    """
    # All data points, translucent so the neighbours stand out.
    colours = ["green" if lbl == 1 else "blue"
               for lbl in dataset.Species.tolist()]
    trace1 = go.Scatter3d(
        x=dataset.SepalLengthCm.tolist(),
        y=dataset.SepalWidthCm.tolist(),
        z=dataset.PetalLengthCm.tolist(),
        mode='markers',
        marker=dict(color=colours, opacity=0.3),
        name="Translucent: Data points",
    )
    # The k nearest neighbours, drawn solid.
    nx, ny, nz, ncolours = [], [], [], []
    for neighbour, _dist in k_nearest_neighbours:
        nx.append(neighbour[0])
        ny.append(neighbour[1])
        nz.append(neighbour[2])
        ncolours.append("green" if neighbour[3] == 1 else "blue")
    trace2 = go.Scatter3d(
        x=nx,
        y=ny,
        z=nz,
        mode='markers',
        marker=dict(color=ncolours),
        name="SOLID: Nearest neighbours",
    )
    # The query point itself, in red.
    # (Fix: removed the dead `Species = [9]` assignment the original made
    # here -- the marker colour is hard-coded and the list was never read.)
    trace0 = go.Scatter3d(
        x=[test_data[0]],
        y=[test_data[1]],
        z=[test_data[2]],
        mode='markers',
        marker=dict(color="red"),
        name='Test data',
    )
    # One line per neighbour; a None entry breaks the line between segments.
    x_lines, y_lines, z_lines = [], [], []
    for neighbour, _dist in k_nearest_neighbours:
        x_lines.extend([neighbour[0], test_data[0], None])
        y_lines.extend([neighbour[1], test_data[1], None])
        z_lines.extend([neighbour[2], test_data[2], None])
    trace3 = go.Scatter3d(
        x=x_lines,
        y=y_lines,
        z=z_lines,
        mode='lines',
        name='Euclidean distance',
    )
    fig = go.Figure(data=[trace0, trace1, trace2, trace3])
    fig.update_layout(scene=dict(
        xaxis_title='SepalLengthCm',
        yaxis_title='SepalWidthCm',
        zaxis_title='PetalLengthCm'))
    plotly.offline.iplot(fig, filename='simple-3d-scatter')
def nearest_neighbours(test_data, train_data):
    """Rank every training row by its Euclidean distance to test_data.

    Returns a list of (row, distance) pairs sorted from closest to
    farthest, where row is [f1, f2, f3, label].
    """
    ranked = [(row, distance(test_data, row)) for row in train_data]
    ranked.sort(key=lambda pair: pair[1])
    return ranked
def load_train_data(df):
    """Convert a DataFrame into a plain nested list of row values."""
    return df.to_numpy().tolist()
def KNN_prediction(test_data, dataset, k, visualise):
    """Classify test_data by a majority vote of its k nearest neighbours.

    Parameters
    ----------
    test_data : sequence of feature values for the query point.
    dataset : DataFrame whose last column holds the numeric class label.
    k : number of neighbours taking part in the vote.
    visualise : when True, plot the neighbourhood and print diagnostics.

    Returns the majority class label among the k nearest neighbours.
    """
    train_data = load_train_data(dataset)
    # Keep only the k closest rows.
    k_nearest_neighbours = nearest_neighbours(test_data, train_data)[0:k]
    if visualise:
        KNN_3D_visualise(test_data, k_nearest_neighbours, dataset)
        # Fix: this header previously printed unconditionally, spamming the
        # output of every cross-validation call; all diagnostics now sit
        # behind the `visualise` flag like the rest of the function.
        print(f"The {k} nearest neighbours are: \n")
    label_class = []
    for neighbour, dist in k_nearest_neighbours:
        if visualise:
            print(f"Neighbour = {neighbour} \t Euclidean distance = {dist}")
        label_class.append(neighbour[-1])
    # Majority vote: the most frequent label among the neighbours.
    predicted_class = max(set(label_class), key=label_class.count)
    if visualise:
        print(f"\nThe {k} nearest neighbours' classes are: ")
        print(label_class)
        print(f"\nPrediction class = {predicted_class}")
    return predicted_class
K-fold cross validation using KNN:
Assumptions/given:
The number of folds is set to 10
K value in KNN is set to 3
from sklearn.model_selection import KFold

# 10-fold cross validation of the hand-rolled KNN classifier (k = 3).
fold = 10
kf = KFold(n_splits=fold)
k = 3
c = 1
average_accuracy = 0
print("Cross validation accuracy measures: ")
for train, test in kf.split(X):
    X_test = X.values[test]
    y_test = y.values[test]
    # Fix: classify against the TRAINING fold only.  The original passed the
    # full `data` frame to KNN_prediction (and left X_train/y_train unused),
    # so every test point found itself in the reference set at distance 0 --
    # leaking the test fold and inflating each fold's accuracy to 1.0.
    train_fold = data.iloc[train]
    pred = [KNN_prediction(row, train_fold, k, visualise=False) for row in X_test]
    print(f"Fold {int(c)}) accuracy = {accuracy(pred, y_test)}")
    average_accuracy += accuracy(pred, y_test)
    c += 1
print("\nAverage accuracy = ", average_accuracy/fold)
Cross validation accuracy measures: Fold 1) accuracy = 1.0 Fold 2) accuracy = 1.0 Fold 3) accuracy = 1.0 Fold 4) accuracy = 1.0 Fold 5) accuracy = 1.0 Fold 6) accuracy = 1.0 Fold 7) accuracy = 1.0 Fold 8) accuracy = 1.0 Fold 9) accuracy = 1.0 Fold 10) accuracy = 1.0 Average accuracy = 1.0
10 fold cross validation using Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
# 10-fold cross validation of Gaussian Naive Bayes via scikit-learn.
gnb = GaussianNB()
# cross_val_score fits a fresh clone of gnb on each of the 10 folds and
# returns the per-fold accuracy scores.
cv_scores = cross_val_score(gnb, X, y, cv=10)
print("Cross validation accuracy measures = ", cv_scores)
print("Average accuracy: ", np.mean(cv_scores))
Cross validation accuracy measures = [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] Average accuracy: 1.0
The stacking is done using the above two classifiers (KNN and Naive Bayes) with the meta classifier as Logistic regression
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# First-level (base) classifiers.
knn = KNeighborsClassifier(n_neighbors=3)
gnb = GaussianNB()

# Fix: the original built knn/gnb and then immediately constructed brand-new
# instances inside `est`, leaving the two objects above unused here.  Reuse
# them instead; scikit-learn clones each estimator at fit time, so the same
# objects can also safely be fitted standalone later.
est = [('knn', knn), ('nb', gnb)]
# Stack the base classifiers with logistic regression as the meta-learner.
clf = StackingClassifier(estimators=est, final_estimator=LogisticRegression())
Ensemble Kfold cross validation:
The 10 fold cross validation of the ensemble classifier is done below.
# 10-fold cross validation of the stacked ensemble (KNN + NB -> logistic
# regression); each fold fits a fresh clone of clf.
cv_scores = cross_val_score(clf, X, y, cv=10)
print('Cross validation accuracy measures = ', cv_scores)
print('Average accuracy = ', np.mean(cv_scores))
Cross validation accuracy measures = [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] Average accuracy = 1.0
The two individual classifiers (KNN and Naive Bayes) each gave an overall average accuracy of 1.00 — the maximum possible. The ensemble therefore cannot exceed them; at best it matches their perfect score, which is exactly what the result above shows.
# Query point: [SepalLengthCm, SepalWidthCm, PetalLengthCm].
test_data = [6,3,2]
KNN (individual prediction)
# Classify the query point with several values of k, visualising each run.
k_values = [3, 5, 8]
for k in k_values:
    print(f"K = {k}:\n")
    prediction_knn = KNN_prediction(test_data, data, k, visualise=True)
    print(f"The class predicted for the data {test_data} is {prediction_knn}", end="")
    print(" (Iris-Versicolor)" if prediction_knn == 1 else " (Iris-Setosa)")
    print("_" * 100)
K = 3:
The 3 nearest neighbours are: Neighbour = [5.4, 3.4, 1.7, 0.0] Euclidean distance = 0.7810249675906652 Neighbour = [5.4, 3.4, 1.5, 0.0] Euclidean distance = 0.877496438739212 Neighbour = [5.7, 3.8, 1.7, 0.0] Euclidean distance = 0.9055385138137414 The 3 nearest neighbours' classes are: [0.0, 0.0, 0.0] Prediction class = 0.0 The class predicted for the data [6, 3, 2] is 0.0 (Iris-Setosa) ____________________________________________________________________________________________________ K = 5:
The 5 nearest neighbours are: Neighbour = [5.4, 3.4, 1.7, 0.0] Euclidean distance = 0.7810249675906652 Neighbour = [5.4, 3.4, 1.5, 0.0] Euclidean distance = 0.877496438739212 Neighbour = [5.7, 3.8, 1.7, 0.0] Euclidean distance = 0.9055385138137414 Neighbour = [5.5, 3.5, 1.3, 0.0] Euclidean distance = 0.99498743710662 Neighbour = [5.1, 3.3, 1.7, 0.0] Euclidean distance = 0.9949874371066202 The 5 nearest neighbours' classes are: [0.0, 0.0, 0.0, 0.0, 0.0] Prediction class = 0.0 The class predicted for the data [6, 3, 2] is 0.0 (Iris-Setosa) ____________________________________________________________________________________________________ K = 8:
The 8 nearest neighbours are: Neighbour = [5.4, 3.4, 1.7, 0.0] Euclidean distance = 0.7810249675906652 Neighbour = [5.4, 3.4, 1.5, 0.0] Euclidean distance = 0.877496438739212 Neighbour = [5.7, 3.8, 1.7, 0.0] Euclidean distance = 0.9055385138137414 Neighbour = [5.5, 3.5, 1.3, 0.0] Euclidean distance = 0.99498743710662 Neighbour = [5.1, 3.3, 1.7, 0.0] Euclidean distance = 0.9949874371066202 Neighbour = [5.4, 3.7, 1.5, 0.0] Euclidean distance = 1.0488088481701514 Neighbour = [5.2, 3.5, 1.5, 0.0] Euclidean distance = 1.067707825203131 Neighbour = [5.0, 3.0, 1.6, 0.0] Euclidean distance = 1.0770329614269007 The 8 nearest neighbours' classes are: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Prediction class = 0.0 The class predicted for the data [6, 3, 2] is 0.0 (Iris-Setosa) ____________________________________________________________________________________________________
NOTE: for all values of K the predictions are the same - Iris-setosa (0)
Naive Bayes (individual prediction)
# Fit Naive Bayes on the full dataset and classify the query point.
gnb.fit(X, y)
prediction_gnb = gnb.predict([test_data])
print("Prediction using Naive Bayes = ", prediction_gnb[0])
species_label = "(Iris-Versicolor)" if prediction_gnb == 1 else "(Iris-Setosa)"
print(species_label)
Prediction using Naive Bayes = 0 (Iris-Setosa)
Ensemble stack (overall prediction)
clf.fit(X,y)
prediction_clf = clf.predict([test_data])
print("Prediction using the ensemble = ", prediction_gnb[0])
if prediction_gnb == 1:
print("(Iris-Versicolor)")
else:
print("(Iris-Setosa)")
Prediction using the ensemble = 0 (Iris-Setosa)
Predictions on the test data:
1) KNN: Iris-Setosa
2) Naive Bayes: Iris-Setosa
3) Ensemble: Iris-Setosa
The Ensemble prediction is Iris-setosa (0) and this seems to be the true value from the visualisation shown in KNN above.
Therefore, we can conclude that the ensemble performs at least as well as the individual classifiers.